# Default chunk options: echo code, include output, center all figures
knitr::opts_chunk$set(echo = TRUE, include = TRUE,fig.align = "center")
# Avoid scientific notation in printed numbers; render NA as blank in kable tables
options(scipen = 999, knitr.kable.NA = "")
# Installing libraries (if do not have)
##install.packages("tidyverse")
##install.packages("lubridate")
##install.packages("readxl")
##install.packages("skimr")
##install.packages("magrittr")
##install.packages("tidyquant")
##install.packages("tsibble")
##install.packages("feasts")
##install.packages("ggcorrplot")
##install.packages("glmnet")
##install.packages("caret")
##install.packages("rattle")
# Importing libraries
library(tidyverse)   # readr/dplyr/ggplot2: import, wrangling, plotting
library(lubridate)   # as_date() and other date helpers
library(readxl)      # Excel import (not visibly used below - confirm needed)
library(skimr)       # skim() summary statistics
library(magrittr)    # pipe operators
library(tidyquant)   # theme_tq() ggplot theme
library(tsibble)     # time-series tibbles (not visibly used below - confirm)
library(feasts)      # time-series features (not visibly used below - confirm)
library(ggcorrplot)  # correlation plots (not visibly used below - confirm)
library(glmnet)      # elastic-net backend for caret's "glmnet" method
library(caret)       # train()/trainControl() tuning framework
library(rattle)      # fancyRpartPlot() for rpart trees
set.seed(1234)       # reproducibility of resampling and model fits
Hourly solar power plant production data is imported from a csv file.
# Read hourly production + weather data; drop the timestamp column,
# which is redundant given the separate Date and Hour columns
df <- read_csv("production_data_with_weather.csv") %>% select(-timestamp)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Date = col_date(format = ""),
## Hour = col_double(),
## Production = col_double(),
## timestamp = col_datetime(format = ""),
## CLOUD_LOW_LAYER_37.75_34.25 = col_double(),
## CLOUD_LOW_LAYER_37.75_34.5 = col_double(),
## CLOUD_LOW_LAYER_38_34.25 = col_double(),
## CLOUD_LOW_LAYER_38_34.5 = col_double(),
## DSWRF_37.75_34.25 = col_double(),
## DSWRF_37.75_34.5 = col_double(),
## DSWRF_38_34.25 = col_double(),
## DSWRF_38_34.5 = col_double(),
## TEMP_37.75_34.25 = col_double(),
## TEMP_37.75_34.5 = col_double(),
## TEMP_38_34.25 = col_double(),
## TEMP_38_34.5 = col_double()
## )
head(df)
## # A tibble: 6 x 15
## Date Hour Production CLOUD_LOW_LAYER… CLOUD_LOW_LAYER… CLOUD_LOW_LAYER…
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-10-09 0 0 0 0 0
## 2 2019-10-09 1 0 0 0 0
## 3 2019-10-09 2 0 0 0 0
## 4 2019-10-09 3 0 0 0 0
## 5 2019-10-09 4 0 0 0 0
## 6 2019-10-09 5 0 0 0 0
## # … with 9 more variables: CLOUD_LOW_LAYER_38_34.5 <dbl>,
## # DSWRF_37.75_34.25 <dbl>, DSWRF_37.75_34.5 <dbl>, DSWRF_38_34.25 <dbl>,
## # DSWRF_38_34.5 <dbl>, TEMP_37.75_34.25 <dbl>, TEMP_37.75_34.5 <dbl>,
## # TEMP_38_34.25 <dbl>, TEMP_38_34.5 <dbl>
Summary statistics of variables are shown.
skim(df)
| Name | df |
| Number of rows | 11592 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Date | 0 | 1 | 2019-10-09 | 2021-02-02 | 2020-06-06 | 483 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Hour | 0 | 1 | 11.50 | 6.92 | 0.00 | 5.75 | 11.50 | 17.25 | 23.00 | ▇▇▆▇▇ |
| Production | 0 | 1 | 6.20 | 9.04 | 0.00 | 0.00 | 0.00 | 11.80 | 30.00 | ▇▁▁▁▁ |
| CLOUD_LOW_LAYER_37.75_34.25 | 0 | 1 | 14.74 | 30.05 | 0.00 | 0.00 | 0.00 | 6.00 | 100.00 | ▇▁▁▁▁ |
| CLOUD_LOW_LAYER_37.75_34.5 | 0 | 1 | 15.45 | 30.36 | 0.00 | 0.00 | 0.00 | 10.00 | 100.00 | ▇▁▁▁▁ |
| CLOUD_LOW_LAYER_38_34.25 | 0 | 1 | 16.17 | 31.48 | 0.00 | 0.00 | 0.00 | 11.00 | 100.00 | ▇▁▁▁▁ |
| CLOUD_LOW_LAYER_38_34.5 | 0 | 1 | 16.33 | 31.30 | 0.00 | 0.00 | 0.00 | 13.00 | 100.00 | ▇▁▁▁▁ |
| DSWRF_37.75_34.25 | 0 | 1 | 209.32 | 268.02 | 0.00 | 0.00 | 70.00 | 380.00 | 950.00 | ▇▂▂▁▁ |
| DSWRF_37.75_34.5 | 0 | 1 | 208.64 | 267.75 | 0.00 | 0.00 | 70.00 | 380.00 | 950.00 | ▇▂▂▁▁ |
| DSWRF_38_34.25 | 0 | 1 | 208.81 | 268.09 | 0.00 | 0.00 | 60.00 | 380.00 | 950.00 | ▇▂▂▁▁ |
| DSWRF_38_34.5 | 0 | 1 | 207.94 | 267.06 | 0.00 | 0.00 | 64.00 | 372.75 | 952.00 | ▇▂▂▁▁ |
| TEMP_37.75_34.25 | 0 | 1 | 12.97 | 9.11 | -10.28 | 5.86 | 12.22 | 19.38 | 38.95 | ▁▇▇▅▁ |
| TEMP_37.75_34.5 | 0 | 1 | 12.04 | 9.15 | -12.31 | 4.90 | 11.26 | 18.41 | 37.65 | ▁▇▇▅▂ |
| TEMP_38_34.25 | 0 | 1 | 10.98 | 9.06 | -12.34 | 3.87 | 10.22 | 17.39 | 36.25 | ▁▇▇▅▂ |
| TEMP_38_34.5 | 0 | 1 | 8.99 | 9.05 | -15.47 | 1.75 | 8.15 | 15.55 | 33.65 | ▁▇▇▅▂ |
Checking for which hours of the day there is production
# Quick look at which hours of the day show non-zero production
plot(df$Hour, df$Production, xlab = "Hour", ylab = "Production", main = "Hourly Production Plot")
Filtering out the hours without production (Hour < 5 or Hour > 19) from the data
# Keep only the daylight window (hours 5..19, inclusive); production is
# essentially zero outside it.
df2 <- df %>% filter(between(Hour, 5, 19))
head(df2, 20)
## # A tibble: 20 x 15
## Date Hour Production CLOUD_LOW_LAYER_37.75_34… CLOUD_LOW_LAYER_37.75_…
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2019-10-09 5 0 0 0
## 2 2019-10-09 6 0.04 0 0
## 3 2019-10-09 7 3.7 0 0
## 4 2019-10-09 8 11.2 0 0
## 5 2019-10-09 9 19.8 0 0
## 6 2019-10-09 10 24.9 0 0
## 7 2019-10-09 11 25.7 0 0
## 8 2019-10-09 12 25.7 0 0
## 9 2019-10-09 13 25.7 0 0
## 10 2019-10-09 14 24 0 0
## 11 2019-10-09 15 18.3 0 0
## 12 2019-10-09 16 12.3 0 0
## 13 2019-10-09 17 4.27 0 0
## 14 2019-10-09 18 0.08 0 0
## 15 2019-10-09 19 0 0 0
## 16 2019-10-10 5 0 0 0
## 17 2019-10-10 6 0.04 0 0
## 18 2019-10-10 7 3.63 0 0
## 19 2019-10-10 8 12.5 0 0
## 20 2019-10-10 9 19.9 0 0
## # … with 10 more variables: CLOUD_LOW_LAYER_38_34.25 <dbl>,
## # CLOUD_LOW_LAYER_38_34.5 <dbl>, DSWRF_37.75_34.25 <dbl>,
## # DSWRF_37.75_34.5 <dbl>, DSWRF_38_34.25 <dbl>, DSWRF_38_34.5 <dbl>,
## # TEMP_37.75_34.25 <dbl>, TEMP_37.75_34.5 <dbl>, TEMP_38_34.25 <dbl>,
## # TEMP_38_34.5 <dbl>
Scatter plots, variable distributions and a correlation table for the target and the potential independent variables. We can see that the same weather variables measured at different coordinates are highly correlated with each other.
# Pairwise scatter plots, marginal distributions, and correlations for all
# numeric variables (Date and Hour excluded)
GGally::ggpairs(df %>% select(-Date, -Hour))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
As the same type of variables at different coordinates take very similar values and are highly correlated, we average them to reduce each group to a single variable. This is done because highly correlated predictors should not be used together when modeling with time series.
# Average each weather variable over its four grid coordinates (the
# per-coordinate series are highly correlated, see ggpairs above), then keep
# only the averaged predictors alongside Date / Hour / Production.
df3 <- df2 %>%
mutate(CLOUD_LOW_LAYER_AVG = (CLOUD_LOW_LAYER_37.75_34.25 + CLOUD_LOW_LAYER_37.75_34.5 + CLOUD_LOW_LAYER_38_34.25 + CLOUD_LOW_LAYER_38_34.5) / 4,
DSWRF_AVG = (DSWRF_37.75_34.25 + DSWRF_37.75_34.5 + DSWRF_38_34.25 + DSWRF_38_34.5) / 4,
TEMP_AVG = (TEMP_37.75_34.25 + TEMP_37.75_34.5 + TEMP_38_34.25 + TEMP_38_34.5) / 4) %>%
select(Date, Hour, Production, CLOUD_LOW_LAYER_AVG, DSWRF_AVG, TEMP_AVG)
head(df3)
## # A tibble: 6 x 6
## Date Hour Production CLOUD_LOW_LAYER_AVG DSWRF_AVG TEMP_AVG
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-10-09 5 0 0 0 12.7
## 2 2019-10-09 6 0.04 0 0 12.4
## 3 2019-10-09 7 3.7 0 0 12.2
## 4 2019-10-09 8 11.2 0 20 15.1
## 5 2019-10-09 9 19.8 0 62.8 16.7
## 6 2019-10-09 10 24.9 0 475 17.9
Summary statistics of reduced data
skim(df3)
| Name | df3 |
| Number of rows | 7245 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Date | 0 | 1 | 2019-10-09 | 2021-02-02 | 2020-06-06 | 483 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Hour | 0 | 1 | 12.00 | 4.32 | 5.00 | 8.00 | 12.00 | 16.00 | 19.00 | ▇▇▇▇▇ |
| Production | 0 | 1 | 9.92 | 9.69 | 0.00 | 0.15 | 6.69 | 19.87 | 30.00 | ▇▂▂▃▂ |
| CLOUD_LOW_LAYER_AVG | 0 | 1 | 15.61 | 29.30 | 0.00 | 0.00 | 0.00 | 14.50 | 100.00 | ▇▁▁▁▁ |
| DSWRF_AVG | 0 | 1 | 310.43 | 285.05 | 0.00 | 20.00 | 260.00 | 525.00 | 945.00 | ▇▃▃▂▂ |
| TEMP_AVG | 0 | 1 | 12.50 | 9.63 | -12.02 | 4.94 | 11.42 | 19.96 | 36.63 | ▁▇▇▆▂ |
Histograms of target and independent variables
ggplot(df3, aes(x=Production)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df3, aes(x=TEMP_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df3, aes(x=DSWRF_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df3, aes(x=CLOUD_LOW_LAYER_AVG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Boxplots of target and independent variables
# 2x2 panel of boxplots for the target and the three averaged predictors
par(mfrow=c(2,2))
boxplot(df3$Production, main = "Production")
boxplot(df3$TEMP_AVG, main = "Temperature")
boxplot(df3$DSWRF_AVG, main = "DSWRF")
boxplot(df3$CLOUD_LOW_LAYER_AVG, main = "Cloud Low Layer")
par(mfrow=c(1,1))  # restore single-panel layout
Scatter plot of Cloud Low Layer vs Production values is shown below. We can observe that higher cloud low layer values cause a lower frequency of high production values.
plot(df3$CLOUD_LOW_LAYER_AVG, df3$Production, xlab = "Cloud Low Layer", ylab = "Production", main = "Cloud Low Layer-Production Scatter Plot")
Scatter plot of DSWRF vs Production values is shown below. We can say that DSWRF values are directly proportional to the Production values. As the DSWRF values increase, the Production values also increase.
plot(df3$DSWRF_AVG, df3$Production, xlab = "DSWRF", ylab = "Production", main = "DSWRF-Production Scatter Plot")
Scatter plot of Hour vs DSWRF values is shown below. There is a relationship between DSWRF values and hours similar to the relation of Production-Hour
plot(df3$Hour, df3$DSWRF_AVG , xlab = "Hour", ylab = "DSWRF", main = "Hour-DSWRF Scatter Plot")
Scatter plot of Hour vs Temperature values is shown below. As expected, temperatures are higher in the afternoon hours and lower in the morning and evening hours.
plot(df3$Hour, df3$TEMP_AVG, xlab = "Hour", ylab = "Temperature", main = "Hour-Temperature Scatter Plot")
Scatter plot of DSWRF vs Temperature values is shown below. Low temperature values correspond to low DSWRF values, while higher DSWRF values correspond to slightly higher temperature values.
plot(df3$DSWRF_AVG, df3$TEMP_AVG , xlab = "DSWRF", ylab = "Temperature", main = "DSWRF-Temperature Scatter Plot")
Line plot of Production as a time series for each hour is shown below. We can observe a seasonal effect on production for the morning and evening hours, as daylight hours shift along the year.
# Production over time, one facet per hour of day
df3 %>%
ggplot(aes(x = Date, y = Production)) +
geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) +
labs(title="Line plot of Production for each hour", y="Production Amount", x="Date") +
facet_wrap(~Hour, ncol=4) +
theme_tq()
Line plot of Temperature as a time series for each hour is shown below. We can also observe the same seasonal effect on the temperature variable. Temperature is low at winter and autumn months, while it is higher in summer and spring months (as expected).
# Temperature over time, one facet per hour of day
df3 %>%
ggplot(aes(x = Date, y = TEMP_AVG)) +
geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) +
labs(title="Line plot of Temperature for each hour", y="Temperature", x="Date") +
facet_wrap(~Hour, ncol=4) +
theme_tq()
Line plot of DSWRF as a time series for each hour is shown below. We can see the same seasonal effect with temperature variable. However, DSWRF values are almost zero in morning hours.
# DSWRF (downward shortwave radiation flux) over time, one facet per hour
df3 %>%
ggplot(aes(x = Date, y = DSWRF_AVG)) +
geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) +
labs(title="Line plot of DSWRF for each hour", y="DSWRF", x="Date") +
facet_wrap(~Hour, ncol=4) +
theme_tq()
Line plot of Cloud Low Layer as a time series for each hour is shown below. We can state that there are very few cloudy days at summer months (as expected).
# Cloud low layer over time, one facet per hour of day
df3 %>%
ggplot(aes(x = Date, y = CLOUD_LOW_LAYER_AVG)) +
geom_line(aes(color = as.factor(Hour)), size = 0.9, show.legend = F) +
labs(title="Line plot of Cloud Low Layer for each hour", y="Cloud Low Layer", x="Date") +
facet_wrap(~Hour, ncol=4) +
theme_tq()
Finding the largest mean production level for the whole data to use that hour for parameter tuning and use the same parameters for other hours. Hour 11 or 12 can be used for parameter tuning.
# Mean production per hour, descending - used to pick the hour (11 or 12)
# whose data will drive hyper-parameter tuning
df3 %>% group_by(Hour) %>% summarise(mean_production = mean(Production)) %>% arrange(desc(mean_production))
## # A tibble: 15 x 2
## Hour mean_production
## <dbl> <dbl>
## 1 11 20.4
## 2 12 20.4
## 3 13 19.5
## 4 10 18.3
## 5 14 17.7
## 6 15 14.5
## 7 9 13.6
## 8 16 9.20
## 9 8 7.41
## 10 17 3.95
## 11 7 2.37
## 12 18 0.945
## 13 6 0.359
## 14 19 0.0939
## 15 5 0.0364
New lagged variables are created from both the target and the independent variables: a daily lag (15 rows, one day of 15 hourly observations) and a weekly lag (105 rows).
# Create lagged predictors from the target and the weather variables.
# NOTE(review): after filtering to 15 hours per day, a lag of 15 rows is the
# same hour one day earlier and 105 rows is one week earlier - the "_lag24"
# suffix is misleading (the lag is 15 rows, not 24); renaming would break the
# printed model output below, so only flagging it here.
df4 <- df3 %>%
mutate(Production_lag24 = lag(Production, 15),
TEMP_AVG_lag24 = lag(TEMP_AVG, 15),
DSWRF_AVG_lag24 = lag(DSWRF_AVG, 15),
CLOUD_LOW_LAYER_AVG_lag24 = lag(CLOUD_LOW_LAYER_AVG, 15),
Production_lag105 = lag(Production, 105),
TEMP_AVG_lag105 = lag(TEMP_AVG, 105),
DSWRF_AVG_lag105 = lag(DSWRF_AVG, 105),
CLOUD_LOW_LAYER_AVG_lag105 = lag(CLOUD_LOW_LAYER_AVG, 105)) %>%
filter(Date >= as_date('2019-10-16'))  # drop the first week (NA weekly lags)
head(df4)
## # A tibble: 6 x 14
## Date Hour Production CLOUD_LOW_LAYER_AVG DSWRF_AVG TEMP_AVG
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-10-16 5 0 0 0 15.2
## 2 2019-10-16 6 0 0 0 14.8
## 3 2019-10-16 7 0.18 0 0 14.9
## 4 2019-10-16 8 5.14 0 10 17.7
## 5 2019-10-16 9 18.6 0 34.8 19.8
## 6 2019-10-16 10 23.9 0 388. 21.4
## # … with 8 more variables: Production_lag24 <dbl>, TEMP_AVG_lag24 <dbl>,
## # DSWRF_AVG_lag24 <dbl>, CLOUD_LOW_LAYER_AVG_lag24 <dbl>,
## # Production_lag105 <dbl>, TEMP_AVG_lag105 <dbl>, DSWRF_AVG_lag105 <dbl>,
## # CLOUD_LOW_LAYER_AVG_lag105 <dbl>
Train and test data sets are created
# Time-based split: train on dates before 2020-12-01, test on Dec 2020 - Jan 2021
df4_train <- df4 %>% filter(Date < as_date('2020-12-01'))
df4_test <- df4 %>% filter(Date >= as_date('2020-12-01') & Date <= as_date('2021-01-31'))
Splitting train data into hourly format to have different models for every hour
# One training subset per daylight hour: df4_train_h5 ... df4_train_h19.
# Equivalent to fifteen explicit filter() assignments, written as a loop.
for (h in 5:19) {
  assign(paste0("df4_train_h", h), df4_train %>% filter(Hour == h))
}
Splitting test data into hourly format to have different models for every hour
# One test subset per daylight hour: df4_test_h5 ... df4_test_h19.
# Equivalent to fifteen explicit filter() assignments, written as a loop.
for (h in 5:19) {
  assign(paste0("df4_test_h", h), df4_test %>% filter(Hour == h))
}
Different models such as linear regression, decision tree, random forest, GLMNET are tried for data set of Hour 11 to make the parameter tuning on the data.
# Shared resampling scheme for all models: 5-fold cross-validation repeated
# 5 times (25 resamples per candidate parameter setting)
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 5)
Linear Regression model with repeated cross validation (fitControl) is created by using the caret package ("lm" method). According to the results, the intercept parameter is tuned and set to TRUE.
# Linear-regression baseline on the Hour-11 training data (Date/Hour dropped).
# caret's "lm" method has only the 'intercept' tuning parameter, so
# tuneLength = 5 has no practical effect here.
lin_reg_h11 <- train(Production ~ .,
data = df4_train_h11 %>% select(-Date, -Hour),
method = "lm",
trControl = fitControl,
tuneLength = 5)
lin_reg_h11
## Linear Regression
##
## 412 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 330, 329, 329, 330, 330, 330, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.22662 0.5116809 3.609332
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lin_reg_h11)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.8165 -1.6674 0.7105 2.6709 18.4428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.041508 1.456740 9.639 < 0.0000000000000002
## CLOUD_LOW_LAYER_AVG -0.068715 0.015076 -4.558 0.00000687140
## DSWRF_AVG 0.026823 0.004497 5.965 0.00000000539
## TEMP_AVG 0.004738 0.112456 0.042 0.9664
## Production_lag24 0.209047 0.049474 4.225 0.00002957204
## TEMP_AVG_lag24 0.018438 0.109055 0.169 0.8658
## DSWRF_AVG_lag24 -0.011216 0.004544 -2.468 0.0140
## CLOUD_LOW_LAYER_AVG_lag24 -0.027582 0.015864 -1.739 0.0829
## Production_lag105 -0.036981 0.048980 -0.755 0.4507
## TEMP_AVG_lag105 0.007737 0.057408 0.135 0.8929
## DSWRF_AVG_lag105 -0.007804 0.004473 -1.745 0.0818
## CLOUD_LOW_LAYER_AVG_lag105 -0.017055 0.015458 -1.103 0.2705
##
## (Intercept) ***
## CLOUD_LOW_LAYER_AVG ***
## DSWRF_AVG ***
## TEMP_AVG
## Production_lag24 ***
## TEMP_AVG_lag24
## DSWRF_AVG_lag24 *
## CLOUD_LOW_LAYER_AVG_lag24 .
## Production_lag105
## TEMP_AVG_lag105
## DSWRF_AVG_lag105 .
## CLOUD_LOW_LAYER_AVG_lag105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.087 on 400 degrees of freedom
## Multiple R-squared: 0.5422, Adjusted R-squared: 0.5296
## F-statistic: 43.06 on 11 and 400 DF, p-value: < 0.00000000000000022
lin_reg_h11$finalModel
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) CLOUD_LOW_LAYER_AVG
## 14.041508 -0.068715
## DSWRF_AVG TEMP_AVG
## 0.026823 0.004738
## Production_lag24 TEMP_AVG_lag24
## 0.209047 0.018438
## DSWRF_AVG_lag24 CLOUD_LOW_LAYER_AVG_lag24
## -0.011216 -0.027582
## Production_lag105 TEMP_AVG_lag105
## -0.036981 0.007737
## DSWRF_AVG_lag105 CLOUD_LOW_LAYER_AVG_lag105
## -0.007804 -0.017055
Decision Tree model with repeated cross validation (fitControl) is created by using caret package (“rpart” method). According to results, cp parameter is tuned and set to 0.01831669.
# CART decision tree for Hour 11; caret evaluates 5 candidate cp values
# (tuneLength = 5) via the repeated CV defined in fitControl
dec_tree_h11 <- train(Production ~ .,
data = df4_train_h11 %>% select(-Date, -Hour),
method = "rpart",
trControl = fitControl,
tuneLength = 5)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
dec_tree_h11
## CART
##
## 412 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 330, 331, 330, 328, 329, 331, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01831669 5.502001 0.4651920 3.643034
## 0.02327174 5.510581 0.4597672 3.714958
## 0.05404548 5.567895 0.4432877 3.838971
## 0.10169783 5.905240 0.3742533 4.127225
## 0.39236226 6.703265 0.2895707 4.972411
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01831669.
trellis.par.set(caretTheme())
plot(dec_tree_h11)
fancyRpartPlot(dec_tree_h11$finalModel)
dec_tree_h11$finalModel
## n= 412
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 412 22608.7000 20.513300
## 2) CLOUD_LOW_LAYER_AVG>=26.375 70 3718.8940 10.256860
## 4) DSWRF_AVG< 251.25 35 830.3398 6.078857 *
## 5) DSWRF_AVG>=251.25 35 1666.6560 14.434860 *
## 3) CLOUD_LOW_LAYER_AVG< 26.375 342 10019.0000 22.612570
## 6) DSWRF_AVG< 363.75 62 3572.6530 17.102420
## 12) DSWRF_AVG< 232.5 8 209.0406 9.973750 *
## 13) DSWRF_AVG>=232.5 54 2896.8410 18.158520
## 26) DSWRF_AVG_lag105>=301.25 23 1136.4540 14.335650 *
## 27) DSWRF_AVG_lag105< 301.25 31 1174.8720 20.994840 *
## 7) DSWRF_AVG>=363.75 280 4147.0950 23.832680 *
Random Forest model with repeated cross validation (fitControl) is created by using the caret package ("ranger" method). Impurity is used as the importance criterion, and the number of trees is set to 50 for performance reasons. According to the results, the mtry parameter is tuned and set to 6, splitrule to "variance", and min.node.size to 5.
# Random forest (ranger) for Hour 11; caret tunes mtry/splitrule over its
# default grid. num.trees kept small (50) for speed; impurity-based variable
# importance is recorded.
rand_forest_h11 <- train(Production ~ .,
data = df4_train_h11 %>% select(-Date, -Hour),
method = "ranger",
trControl = fitControl,
num.trees = 50,
importance = "impurity")
rand_forest_h11
## Random Forest
##
## 412 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 332, 329, 329, 329, 329, 329, ...
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 5.264371 0.4989451 3.616634
## 2 extratrees 5.286603 0.4983441 3.685986
## 6 variance 5.227255 0.5068756 3.521374
## 6 extratrees 5.230724 0.5063643 3.545468
## 11 variance 5.296689 0.4943498 3.526574
## 11 extratrees 5.231504 0.5057880 3.536130
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 6, splitrule = variance
## and min.node.size = 5.
plot(rand_forest_h11)
rand_forest_h11$finalModel
## Ranger result
##
## Call:
## ranger::ranger(dependent.variable.name = ".outcome", data = x, mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size, splitrule = as.character(param$splitrule), write.forest = TRUE, probability = classProbs, ...)
##
## Type: Regression
## Number of trees: 50
## Sample size: 412
## Number of independent variables: 11
## Mtry: 6
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 29.22577
## R squared (OOB): 0.4687092
GLMNET model with repeated cross validation (fitControl) is created by using the caret package ("glmnet" method). According to the results, the alpha parameter is tuned and set to 0.55 and the lambda parameter to 0.09524498 (note: the per-hour models below use alpha = 0.1 — confirm which value is intended).
# Elastic net (glmnet) for Hour 11; caret tunes alpha and lambda by repeated CV.
# Fix: the argument was misspelled "tuneLenght", which caret silently swallowed
# via "...", so the default grid of 3 was used (visible in the printed 3x3
# alpha/lambda output below). Spelled correctly, a 5-point grid is searched.
glmnet_h11 <- train(Production ~ .,
                    data = df4_train_h11 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneLength = 5)
glmnet_h11
## glmnet
##
## 412 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 328, 329, 330, 330, 331, 330, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.009524498 5.167968 0.5197991 3.575222
## 0.10 0.095244980 5.156209 0.5211426 3.580631
## 0.10 0.952449803 5.181813 0.5178728 3.686135
## 0.55 0.009524498 5.164148 0.5204057 3.573684
## 0.55 0.095244980 5.154017 0.5211951 3.591338
## 0.55 0.952449803 5.199747 0.5231104 3.745337
## 1.00 0.009524498 5.160886 0.5209047 3.572558
## 1.00 0.095244980 5.161451 0.5197672 3.613407
## 1.00 0.952449803 5.270544 0.5227863 3.854134
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.09524498.
plot(glmnet_h11)
Candidate models for Hour 11 are compared below. The Random Forest model has almost the same R-squared value as the other models and a slightly lower (so better) MAE. As a result of this analysis, the Random Forest model is selected.
# Pool the 25 resample results of the four Hour-11 models for comparison
results = resamples(list(Linear_Regression = lin_reg_h11, Decision_Tree = dec_tree_h11, Random_Forest = rand_forest_h11, GLMNET = glmnet_h11))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 2.893254 3.455664 3.628979 3.609332 3.745557 4.304403 0
## Decision_Tree 2.917881 3.369264 3.632743 3.643034 3.844598 4.609441 0
## Random_Forest 2.801116 3.281684 3.476505 3.521374 3.801898 4.137494 0
## GLMNET 2.988826 3.413598 3.559016 3.591338 3.760475 4.150317 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.840532 5.026626 5.328700 5.226620 5.629565 6.165916 0
## Decision_Tree 4.405733 5.147553 5.415286 5.502001 5.842309 6.806200 0
## Random_Forest 4.414741 4.713916 5.296077 5.227255 5.641607 6.249548 0
## GLMNET 4.064381 4.720794 5.158619 5.154017 5.442912 6.105409 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.3289091 0.4744179 0.4973486 0.5116809 0.5380840 0.6539699
## Decision_Tree 0.2435324 0.4030340 0.4816605 0.4651920 0.5199176 0.6486876
## Random_Forest 0.3512146 0.4489992 0.4965389 0.5068756 0.5720000 0.6818772
## GLMNET 0.3403778 0.4619531 0.5320585 0.5211951 0.5818343 0.6937941
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
bwplot(results)
Modeling each hour with tuned parameters of all four models to see which one is the best model for different hours. Models for each hour and their comparison results are shown below.
Hour 5
# Linear regression for Hour 5; intercept fixed to the value tuned on Hour 11.
# Fixes: TRUE instead of the reassignable shorthand T; dropped the redundant
# tuneLength argument (ignored by caret when tuneGrid is supplied).
lin_reg_h5 <- train(Production ~ .,
                    data = df4_train_h5 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# CART tree for Hour 5 with cp fixed to the value tuned on Hour 11.
# NOTE(review): tuneLength is ignored when tuneGrid is supplied.
dec_tree_h5 <- train(Production ~ .,
data = df4_train_h5 %>% select(-Date, -Hour),
method = "rpart",
trControl = fitControl,
tuneGrid = expand.grid(cp = 0.01831669),
tuneLength = 5)
# Random forest for Hour 5 with mtry/splitrule/min.node.size fixed to the
# values tuned on Hour 11; 50 trees for speed, impurity importance recorded.
rand_forest_h5 <- train(Production ~ .,
data = df4_train_h5 %>% select(-Date, -Hour),
method = "ranger",
trControl = fitControl,
num.trees = 50,
tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
importance = "impurity")
# Elastic net for Hour 5 with alpha/lambda fixed from the Hour-11 tuning.
# Fix: removed the misspelled "tuneLenght" argument, which caret silently
# ignored; tuneLength would be irrelevant anyway since tuneGrid is supplied.
# NOTE(review): the Hour-11 tuning output selected alpha = 0.55, but 0.1 is
# used here - confirm which value is intended.
glmnet_h5 <- train(Production ~ .,
                   data = df4_train_h5 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))
results = resamples(list(Linear_Regression = lin_reg_h5, Decision_Tree = dec_tree_h5, Random_Forest = rand_forest_h5, GLMNET = glmnet_h5))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu.
## Linear_Regression 0.06109701 0.07149871 0.16213142 0.19377222 0.26224125
## Decision_Tree 0.02534893 0.04190023 0.05595834 0.07307654 0.08807493
## Random_Forest 0.01172859 0.04033275 0.05104760 0.07127874 0.08871822
## GLMNET 0.04697540 0.05254211 0.05658586 0.12308514 0.17762195
## Max. NA's
## Linear_Regression 0.7202957 0
## Decision_Tree 0.1492681 0
## Random_Forest 0.1426512 0
## GLMNET 0.3413626 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.08099045 0.08565357 1.21231884 1.2301644 1.8363075 5.391001
## Decision_Tree 0.08707441 0.14571653 0.18631673 0.3857114 0.2864111 1.201284
## Random_Forest 0.03166954 0.16942030 0.30825572 0.4457357 0.4347880 1.194677
## GLMNET 0.05790149 0.06623457 0.09284883 0.7194918 1.2165981 2.253347
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu.
## Linear_Regression 0.0003331962 0.01634230 0.02856765 0.07717957 0.1430698
## Decision_Tree 0.0733450666 0.14425400 0.20204982 0.28841956 0.5049438
## Random_Forest 0.0132893289 0.16998356 0.28523050 0.28870035 0.3953266
## GLMNET 0.0004256890 0.02797673 0.07236170 0.09339992 0.1486044
## Max. NA's
## Linear_Regression 0.2788443 0
## Decision_Tree 0.5842820 0
## Random_Forest 0.5738456 0
## GLMNET 0.2478791 0
bwplot(results)
Hour 6
# Linear regression for Hour 6; intercept fixed to the value tuned on Hour 11.
# Fixes: TRUE instead of the reassignable shorthand T; dropped the redundant
# tuneLength argument (ignored by caret when tuneGrid is supplied).
lin_reg_h6 <- train(Production ~ .,
                    data = df4_train_h6 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))
# CART tree for Hour 6 with cp fixed to the value tuned on Hour 11.
# NOTE(review): tuneLength is ignored when tuneGrid is supplied.
dec_tree_h6 <- train(Production ~ .,
data = df4_train_h6 %>% select(-Date, -Hour),
method = "rpart",
trControl = fitControl,
tuneGrid = expand.grid(cp = 0.01831669),
tuneLength = 5)
# Random forest for Hour 6 with parameters fixed to the values tuned on
# Hour 11; 50 trees for speed, impurity importance recorded.
rand_forest_h6 <- train(Production ~ .,
data = df4_train_h6 %>% select(-Date, -Hour),
method = "ranger",
trControl = fitControl,
num.trees = 50,
tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
importance = "impurity")
# Elastic net for Hour 6 with alpha/lambda fixed from the Hour-11 tuning.
# Fix: removed the misspelled "tuneLenght" argument, which caret silently
# ignored; tuneLength would be irrelevant anyway since tuneGrid is supplied.
glmnet_h6 <- train(Production ~ .,
                   data = df4_train_h6 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))
results = resamples(list(Linear_Regression = lin_reg_h6, Decision_Tree = dec_tree_h6, Random_Forest = rand_forest_h6, GLMNET = glmnet_h6))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.16940398 0.2001016 0.2429741 0.2683884 0.3530160 0.4884026
## Decision_Tree 0.09787429 0.1530611 0.1679170 0.1851597 0.2106784 0.3095159
## Random_Forest 0.08531213 0.1116778 0.1395333 0.1526156 0.1973566 0.2670701
## GLMNET 0.16498923 0.2152129 0.2348562 0.2587064 0.2950568 0.3900661
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.2194157 0.2922248 1.3590824 0.9674428 1.5107835 2.091764
## Decision_Tree 0.1692675 0.2804642 0.3465727 0.5427619 0.4729900 1.423948
## Random_Forest 0.1862462 0.2833799 0.3993544 0.5717088 0.6353569 1.412374
## GLMNET 0.2251155 0.2942282 1.1294281 0.8548998 1.3440668 1.506428
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.0508196 0.2050972 0.2909114 0.4762651 0.7565689 0.8407262
## Decision_Tree 0.2501168 0.6493123 0.7455029 0.6614932 0.8008137 0.9035438
## Random_Forest 0.2499238 0.5635486 0.7105843 0.6449164 0.8186044 0.8887055
## GLMNET 0.1342418 0.2406220 0.3251768 0.4760765 0.7457675 0.8705448
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 7
# Hour 7: fit the four candidate models on the hour-7 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h7 <- train(Production ~ .,
                    data = df4_train_h7 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h7 <- train(Production ~ .,
                     data = df4_train_h7 %>% select(-Date, -Hour),
                     method = "rpart",
                     tuneGrid = expand.grid(cp = 0.01831669),
                     trControl = fitControl)

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h7 <- train(Production ~ .,
                        data = df4_train_h7 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                        importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h7 <- train(Production ~ .,
                   data = df4_train_h7 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h7,
                          Decision_Tree = dec_tree_h7,
                          Random_Forest = rand_forest_h7,
                          GLMNET = glmnet_h7))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.4971225 0.5596436 0.6225765 0.6345757 0.6548182 0.9677510
## Decision_Tree 0.5983998 0.6627996 0.6976009 0.6959152 0.7296611 0.8363205
## Random_Forest 0.3547400 0.4756874 0.5163212 0.5147180 0.5727098 0.6586124
## GLMNET 0.5215829 0.5953632 0.6677701 0.6681238 0.7344533 0.8144906
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.6691931 0.8429159 0.9569816 1.0283014 1.241142 1.811826
## Decision_Tree 0.7966954 0.9049267 1.0554327 1.0610764 1.167829 1.526170
## Random_Forest 0.5654006 0.7940049 0.9017547 0.9620457 1.105421 1.476581
## GLMNET 0.6974994 0.8209556 0.9923746 1.0467719 1.298232 1.507808
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.5664890 0.8018250 0.8690763 0.8350145 0.8928561 0.9238590
## Decision_Tree 0.6737419 0.7931297 0.8298058 0.8290319 0.8747315 0.8991564
## Random_Forest 0.6978782 0.8016444 0.8745980 0.8563503 0.9047156 0.9593668
## GLMNET 0.6714714 0.7511151 0.8444750 0.8280671 0.9034652 0.9345106
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 8
# Hour 8: fit the four candidate models on the hour-8 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h8 <- train(Production ~ .,
                    data = df4_train_h8 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h8 <- train(Production ~ .,
                     data = df4_train_h8 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h8 <- train(Production ~ .,
                        data = df4_train_h8 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                        importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h8 <- train(Production ~ .,
                   data = df4_train_h8 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h8,
                          Decision_Tree = dec_tree_h8,
                          Random_Forest = rand_forest_h8,
                          GLMNET = glmnet_h8))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 1.386200 1.466004 1.595995 1.591276 1.677737 1.855410 0
## Decision_Tree 1.423172 1.706762 1.789893 1.819229 1.945229 2.086366 0
## Random_Forest 1.125330 1.437559 1.516934 1.493616 1.569949 1.760493 0
## GLMNET 1.370608 1.488966 1.646669 1.641676 1.795324 1.902613 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 1.802077 1.969944 2.203501 2.219020 2.491602 2.807579 0
## Decision_Tree 2.013645 2.247497 2.497295 2.509986 2.757508 3.057554 0
## Random_Forest 1.639538 2.059600 2.250837 2.193052 2.357741 2.637183 0
## GLMNET 1.809695 2.018951 2.208819 2.252863 2.496700 2.837130 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.7115203 0.7910899 0.8262558 0.8181951 0.8534859 0.8831579
## Decision_Tree 0.6723375 0.7418487 0.7695122 0.7685163 0.7993603 0.8472942
## Random_Forest 0.7522255 0.7888502 0.8180590 0.8218847 0.8514367 0.9045943
## GLMNET 0.7302920 0.7736447 0.8144849 0.8119234 0.8492779 0.8745783
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 9
# Hour 9: fit the four candidate models on the hour-9 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h9 <- train(Production ~ .,
                    data = df4_train_h9 %>% select(-Date, -Hour),
                    method = "lm",
                    trControl = fitControl,
                    tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h9 <- train(Production ~ .,
                     data = df4_train_h9 %>% select(-Date, -Hour),
                     method = "rpart",
                     trControl = fitControl,
                     tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h9 <- train(Production ~ .,
                        data = df4_train_h9 %>% select(-Date, -Hour),
                        method = "ranger",
                        trControl = fitControl,
                        num.trees = 50,
                        tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                        importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h9 <- train(Production ~ .,
                   data = df4_train_h9 %>% select(-Date, -Hour),
                   method = "glmnet",
                   trControl = fitControl,
                   tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h9,
                          Decision_Tree = dec_tree_h9,
                          Random_Forest = rand_forest_h9,
                          GLMNET = glmnet_h9))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 2.289454 2.511619 2.646099 2.624025 2.715956 2.965946 0
## Decision_Tree 2.215729 2.546226 2.727988 2.734191 2.969713 3.200974 0
## Random_Forest 2.157627 2.439685 2.531193 2.534053 2.633166 2.999513 0
## GLMNET 2.227191 2.409536 2.625687 2.634654 2.834732 3.149887 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.142513 3.517448 3.693221 3.653569 3.841738 4.168773 0
## Decision_Tree 3.208230 3.613147 3.975658 3.928788 4.164211 4.821493 0
## Random_Forest 2.933008 3.440202 3.710938 3.679170 3.975709 4.479455 0
## GLMNET 2.850025 3.264149 3.659375 3.637146 3.909733 4.536640 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.5756786 0.6887956 0.7094638 0.7116834 0.7357064 0.7993492
## Decision_Tree 0.4937582 0.6255691 0.6524466 0.6606406 0.7276637 0.7760153
## Random_Forest 0.5641566 0.6495491 0.7127404 0.7028668 0.7559808 0.8243018
## GLMNET 0.5331377 0.6528997 0.7249066 0.7088563 0.7641697 0.8492393
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 10
# Hour 10: fit the four candidate models on the hour-10 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h10 <- train(Production ~ .,
                     data = df4_train_h10 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h10 <- train(Production ~ .,
                      data = df4_train_h10 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h10 <- train(Production ~ .,
                         data = df4_train_h10 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h10 <- train(Production ~ .,
                    data = df4_train_h10 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h10,
                          Decision_Tree = dec_tree_h10,
                          Random_Forest = rand_forest_h10,
                          GLMNET = glmnet_h10))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 2.827019 3.138701 3.436751 3.399906 3.610517 4.000010 0
## Decision_Tree 3.114844 3.390721 3.458618 3.523573 3.667151 4.039803 0
## Random_Forest 2.736721 3.123714 3.236087 3.228411 3.331059 3.774466 0
## GLMNET 2.651210 3.045346 3.413147 3.397033 3.658127 4.185256 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.845368 4.300051 4.686413 4.684854 5.188596 5.386741 0
## Decision_Tree 4.336040 4.931528 5.136224 5.230371 5.436961 6.127723 0
## Random_Forest 4.023837 4.500532 4.759400 4.775794 5.060122 5.703028 0
## GLMNET 3.799048 4.144273 4.722191 4.648664 4.915853 5.984849 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.4781749 0.5340666 0.6129745 0.6030435 0.6737420 0.7105188
## Decision_Tree 0.3394893 0.4704252 0.5486128 0.5239339 0.5648775 0.6604553
## Random_Forest 0.4852588 0.5571177 0.5851373 0.5911706 0.6407157 0.7318905
## GLMNET 0.3404036 0.5657334 0.6091553 0.6081792 0.6876935 0.7521348
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 11
# Hour 11: fit the four candidate models on the hour-11 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h11 <- train(Production ~ .,
                     data = df4_train_h11 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h11 <- train(Production ~ .,
                      data = df4_train_h11 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h11 <- train(Production ~ .,
                         data = df4_train_h11 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h11 <- train(Production ~ .,
                    data = df4_train_h11 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h11,
                          Decision_Tree = dec_tree_h11,
                          Random_Forest = rand_forest_h11,
                          GLMNET = glmnet_h11))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.117203 3.378431 3.527296 3.588332 3.649869 4.598174 0
## Decision_Tree 2.909311 3.229981 3.634277 3.597204 3.881523 4.405803 0
## Random_Forest 2.849971 3.288536 3.337324 3.471296 3.742786 4.355053 0
## GLMNET 3.207660 3.449037 3.539508 3.585157 3.739453 4.286480 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 4.325927 4.744098 5.064928 5.177447 5.338782 6.762147 0
## Decision_Tree 4.210169 5.097261 5.397524 5.421982 5.866497 6.429012 0
## Random_Forest 4.178219 4.770878 5.232375 5.164570 5.491026 6.805026 0
## GLMNET 4.317378 4.846473 5.227204 5.173494 5.460170 6.171554 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.2349625 0.4877636 0.5429064 0.5190425 0.5895693 0.6442228
## Decision_Tree 0.2939168 0.3957522 0.4956905 0.4807416 0.5472796 0.7180968
## Random_Forest 0.1775609 0.4675963 0.5175464 0.5193753 0.6141767 0.6775467
## GLMNET 0.3696828 0.4665804 0.5147169 0.5169379 0.5512434 0.6587083
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 12
# Hour 12: fit the four candidate models on the hour-12 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h12 <- train(Production ~ .,
                     data = df4_train_h12 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h12 <- train(Production ~ .,
                      data = df4_train_h12 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h12 <- train(Production ~ .,
                         data = df4_train_h12 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h12 <- train(Production ~ .,
                    data = df4_train_h12 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h12,
                          Decision_Tree = dec_tree_h12,
                          Random_Forest = rand_forest_h12,
                          GLMNET = glmnet_h12))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.066959 3.421513 3.573793 3.590925 3.753643 4.278373 0
## Decision_Tree 3.182917 3.513272 3.687557 3.704190 3.905639 4.024236 0
## Random_Forest 2.931777 3.247847 3.460217 3.544897 3.767698 4.501917 0
## GLMNET 3.098569 3.364207 3.483382 3.560061 3.800194 3.980713 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 4.242096 4.752972 5.039184 5.115155 5.374922 6.109485 0
## Decision_Tree 4.663445 5.303133 5.590856 5.538428 5.820182 6.131801 0
## Random_Forest 4.281506 4.809154 5.157451 5.236048 5.514498 6.674796 0
## GLMNET 4.367972 4.681305 5.073839 5.078893 5.497301 5.943480 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.3297208 0.4845615 0.5336605 0.5229874 0.5556144 0.6425681
## Decision_Tree 0.2426765 0.4040453 0.4407777 0.4485278 0.5057943 0.5839952
## Random_Forest 0.3030567 0.4142810 0.4938516 0.4928226 0.5760130 0.6557259
## GLMNET 0.3902284 0.4530634 0.5272406 0.5217724 0.5782865 0.6452469
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 13
# Hour 13: fit the four candidate models on the hour-13 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h13 <- train(Production ~ .,
                     data = df4_train_h13 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h13 <- train(Production ~ .,
                      data = df4_train_h13 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h13 <- train(Production ~ .,
                         data = df4_train_h13 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h13 <- train(Production ~ .,
                    data = df4_train_h13 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h13,
                          Decision_Tree = dec_tree_h13,
                          Random_Forest = rand_forest_h13,
                          GLMNET = glmnet_h13))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.283446 3.678389 3.854194 3.819304 4.005245 4.315248 0
## Decision_Tree 3.378292 3.709032 3.837238 3.910158 4.081288 4.448233 0
## Random_Forest 3.059001 3.526965 3.732022 3.739911 3.981713 4.350794 0
## GLMNET 3.294857 3.660850 3.845855 3.859275 4.086487 4.569363 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 4.686241 5.098131 5.357375 5.412314 5.650593 6.294016 0
## Decision_Tree 5.034172 5.618371 5.761708 5.825576 6.089539 6.707069 0
## Random_Forest 4.489901 5.210717 5.339412 5.468987 5.856344 6.248504 0
## GLMNET 4.493264 5.117803 5.374365 5.438886 5.709759 6.830564 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.3217198 0.4605431 0.5014228 0.4990038 0.5406080 0.6077315
## Decision_Tree 0.1864731 0.3852106 0.4155047 0.4191339 0.4672401 0.5790096
## Random_Forest 0.2846852 0.4355046 0.4765385 0.4825645 0.5367031 0.6348703
## GLMNET 0.2227228 0.4136758 0.5234453 0.4922653 0.5511015 0.6426111
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 14
# Hour 14: fit the four candidate models on the hour-14 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h14 <- train(Production ~ .,
                     data = df4_train_h14 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h14 <- train(Production ~ .,
                      data = df4_train_h14 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h14 <- train(Production ~ .,
                         data = df4_train_h14 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h14 <- train(Production ~ .,
                    data = df4_train_h14 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h14,
                          Decision_Tree = dec_tree_h14,
                          Random_Forest = rand_forest_h14,
                          GLMNET = glmnet_h14))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.508180 3.924293 4.047812 4.106579 4.339893 4.949461 0
## Decision_Tree 3.130630 3.748720 4.029870 4.086679 4.472295 5.105601 0
## Random_Forest 3.308367 3.616677 3.917309 3.912651 4.099767 4.569188 0
## GLMNET 3.458218 3.937169 4.066736 4.112458 4.402946 4.685981 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 4.705944 5.319636 5.617469 5.596154 5.920488 6.417937 0
## Decision_Tree 4.321025 5.394666 5.688121 5.784243 6.321231 7.268571 0
## Random_Forest 4.780873 5.091839 5.435449 5.450370 5.752966 6.220710 0
## GLMNET 4.534066 5.239388 5.482169 5.567657 5.966554 6.290298 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.2935323 0.3866280 0.4467493 0.4492857 0.5079710 0.5937375
## Decision_Tree 0.1723364 0.3594850 0.4487820 0.4165149 0.4802196 0.6322364
## Random_Forest 0.2765493 0.4379721 0.4707517 0.4695387 0.5184459 0.5960957
## GLMNET 0.2886252 0.4006892 0.4418233 0.4465131 0.5126977 0.5849430
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 15
# Hour 15: fit the four candidate models on the hour-15 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h15 <- train(Production ~ .,
                     data = df4_train_h15 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h15 <- train(Production ~ .,
                      data = df4_train_h15 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h15 <- train(Production ~ .,
                         data = df4_train_h15 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h15 <- train(Production ~ .,
                    data = df4_train_h15 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h15,
                          Decision_Tree = dec_tree_h15,
                          Random_Forest = rand_forest_h15,
                          GLMNET = glmnet_h15))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 3.014716 3.304269 3.718817 3.610721 3.787755 4.325422 0
## Decision_Tree 2.775503 3.367507 3.592634 3.674717 3.964539 4.348776 0
## Random_Forest 2.655412 3.269812 3.444362 3.446101 3.623921 3.995736 0
## GLMNET 3.013965 3.358319 3.597367 3.600768 3.918883 4.123056 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 4.213644 4.677034 5.041977 4.955536 5.359908 5.552394 0
## Decision_Tree 4.056718 4.749213 5.171054 5.196175 5.659125 6.364327 0
## Random_Forest 3.621872 4.613620 4.871145 4.865931 5.225792 5.727652 0
## GLMNET 4.107591 4.515084 4.892711 4.922951 5.362670 5.667085 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.3638357 0.4605392 0.4939939 0.5067243 0.5407407 0.6597904
## Decision_Tree 0.3053825 0.3864035 0.4864143 0.4664758 0.5240788 0.6637746
## Random_Forest 0.3714000 0.4607723 0.5025885 0.5229420 0.5848027 0.6823357
## GLMNET 0.3500360 0.4597862 0.5012187 0.5112695 0.5752844 0.6349804
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 16
# Hour 16: fit the four candidate models on the hour-16 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h16 <- train(Production ~ .,
                     data = df4_train_h16 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h16 <- train(Production ~ .,
                      data = df4_train_h16 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h16 <- train(Production ~ .,
                         data = df4_train_h16 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h16 <- train(Production ~ .,
                    data = df4_train_h16 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h16,
                          Decision_Tree = dec_tree_h16,
                          Random_Forest = rand_forest_h16,
                          GLMNET = glmnet_h16))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 1.995822 2.419684 2.573412 2.559386 2.707568 3.005746 0
## Decision_Tree 2.126803 2.418821 2.603387 2.604103 2.788904 3.029354 0
## Random_Forest 2.038063 2.255853 2.401145 2.438605 2.614763 2.903027 0
## GLMNET 2.006677 2.395276 2.543759 2.534565 2.712171 2.972613 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 2.826198 3.225076 3.436756 3.437818 3.706063 3.946918 0
## Decision_Tree 2.955545 3.359719 3.492810 3.542564 3.832865 4.329442 0
## Random_Forest 2.854902 3.150525 3.339039 3.381359 3.667249 3.977621 0
## GLMNET 2.732919 3.215857 3.449467 3.413480 3.600293 3.974472 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.5528480 0.6289683 0.6632161 0.6643475 0.7024180 0.7815984
## Decision_Tree 0.5000898 0.5988085 0.6596745 0.6404629 0.6795656 0.7541845
## Random_Forest 0.5534123 0.6195876 0.6967533 0.6725992 0.7128051 0.7760320
## GLMNET 0.5469153 0.6370736 0.6684323 0.6676564 0.7013827 0.7893644
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 17
# Hour 17: fit the four candidate models on the hour-17 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h17 <- train(Production ~ .,
                     data = df4_train_h17 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h17 <- train(Production ~ .,
                      data = df4_train_h17 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h17 <- train(Production ~ .,
                         data = df4_train_h17 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h17 <- train(Production ~ .,
                    data = df4_train_h17 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h17,
                          Decision_Tree = dec_tree_h17,
                          Random_Forest = rand_forest_h17,
                          GLMNET = glmnet_h17))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 1.075375 1.272980 1.361687 1.349141 1.453807 1.689689 0
## Decision_Tree 1.232630 1.400354 1.497297 1.493019 1.611589 1.711721 0
## Random_Forest 1.081176 1.192849 1.303859 1.310273 1.423572 1.545252 0
## GLMNET 1.094692 1.225841 1.396720 1.356017 1.440666 1.678798 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Linear_Regression 1.556822 1.870488 2.048242 2.025076 2.179897 2.623589 0
## Decision_Tree 1.682365 2.079345 2.284816 2.228391 2.420291 2.697800 0
## Random_Forest 1.621155 1.851115 2.096050 2.062672 2.247884 2.406862 0
## GLMNET 1.634215 1.797872 2.048909 2.013617 2.136650 2.433957 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.6169692 0.7242731 0.7503258 0.7618300 0.7916721 0.8704356
## Decision_Tree 0.6183650 0.6657735 0.7005302 0.7111222 0.7550302 0.8306167
## Random_Forest 0.6666678 0.7204831 0.7433597 0.7535570 0.7991615 0.8413293
## GLMNET 0.6540482 0.7357092 0.7545931 0.7638747 0.8098491 0.8575323
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 18
# Hour 18: fit the four candidate models on the hour-18 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h18 <- train(Production ~ .,
                     data = df4_train_h18 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h18 <- train(Production ~ .,
                      data = df4_train_h18 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h18 <- train(Production ~ .,
                         data = df4_train_h18 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h18 <- train(Production ~ .,
                    data = df4_train_h18 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h18,
                          Decision_Tree = dec_tree_h18,
                          Random_Forest = rand_forest_h18,
                          GLMNET = glmnet_h18))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.3849398 0.4384068 0.4608148 0.4638871 0.4868914 0.5690345
## Decision_Tree 0.3409060 0.4155545 0.4368887 0.4557123 0.4949975 0.6299430
## Random_Forest 0.2465634 0.3312156 0.3483535 0.3628731 0.3904498 0.5107207
## GLMNET 0.3866043 0.4389573 0.4631879 0.4643945 0.4866759 0.5406999
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.5683505 0.6480250 0.7070179 0.7145571 0.7677800 0.8857574
## Decision_Tree 0.5669806 0.7002014 0.7505882 0.7832582 0.8673409 1.0859657
## Random_Forest 0.4755392 0.6367056 0.6798621 0.6887588 0.7391388 0.9532257
## GLMNET 0.5564430 0.6447261 0.7202300 0.7040448 0.7554512 0.8456627
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.6514716 0.7455537 0.7683856 0.7680803 0.8137536 0.8563431
## Decision_Tree 0.5543311 0.6708407 0.7266006 0.7199871 0.7776893 0.8606548
## Random_Forest 0.6362266 0.7480361 0.8078318 0.7854434 0.8192655 0.8890177
## GLMNET 0.6617434 0.7375800 0.7703730 0.7733320 0.8269172 0.8513065
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
Hour 19
# Hour 19: fit the four candidate models on the hour-19 training subset using
# the repeated-CV settings in `fitControl`. Date (identifier) and Hour
# (constant within an hourly subset) are dropped as non-predictors.
# Hyperparameters are fixed to values found in earlier tuning.

# Ordinary least squares baseline.
lin_reg_h19 <- train(Production ~ .,
                     data = df4_train_h19 %>% select(-Date, -Hour),
                     method = "lm",
                     trControl = fitControl,
                     tuneGrid = expand.grid(intercept = TRUE))

# CART decision tree with the previously tuned complexity parameter.
dec_tree_h19 <- train(Production ~ .,
                      data = df4_train_h19 %>% select(-Date, -Hour),
                      method = "rpart",
                      trControl = fitControl,
                      tuneGrid = expand.grid(cp = 0.01831669))

# Random forest via ranger; impurity importance stored for later inspection.
rand_forest_h19 <- train(Production ~ .,
                         data = df4_train_h19 %>% select(-Date, -Hour),
                         method = "ranger",
                         trControl = fitControl,
                         num.trees = 50,
                         tuneGrid = expand.grid(mtry = 6, splitrule = "variance", min.node.size = 5),
                         importance = "impurity")

# Elastic net. Fix: dropped the `tuneLenght = 5` typo argument, which caret
# silently ignored; tuneGrid already pins a single (alpha, lambda) pair.
glmnet_h19 <- train(Production ~ .,
                    data = df4_train_h19 %>% select(-Date, -Hour),
                    method = "glmnet",
                    trControl = fitControl,
                    tuneGrid = expand.grid(alpha = 0.1, lambda = 0.09524498))

# Collect cross-validation resamples across the four models and summarize.
results <- resamples(list(Linear_Regression = lin_reg_h19,
                          Decision_Tree = dec_tree_h19,
                          Random_Forest = rand_forest_h19,
                          GLMNET = glmnet_h19))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Linear_Regression, Decision_Tree, Random_Forest, GLMNET
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu.
## Linear_Regression 0.11377618 0.13393312 0.14801816 0.1513910 0.1729970
## Decision_Tree 0.08023985 0.10338146 0.11422675 0.1232364 0.1301680
## Random_Forest 0.07622747 0.09059199 0.09913574 0.1045346 0.1135062
## GLMNET 0.10461614 0.12224866 0.13046102 0.1364589 0.1539398
## Max. NA's
## Linear_Regression 0.1959930 0
## Decision_Tree 0.2212080 0
## Random_Forest 0.1515046 0
## GLMNET 0.1789626 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.1535314 0.1841391 0.2256384 0.2901539 0.3428669 0.5888926
## Decision_Tree 0.1781888 0.2233287 0.2621926 0.3143660 0.3232364 0.6067362
## Random_Forest 0.1689211 0.1928223 0.2303002 0.2832471 0.2849836 0.5778092
## GLMNET 0.1452817 0.1850964 0.2338504 0.2874187 0.3001882 0.5518154
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Linear_Regression 0.06116251 0.1482205 0.2706640 0.2388334 0.3034867 0.3949668
## Decision_Tree 0.01650860 0.1092602 0.1639303 0.1872464 0.2333479 0.6517671
## Random_Forest 0.09916785 0.1838025 0.2741495 0.2900228 0.3910371 0.5452759
## GLMNET 0.07586490 0.1812867 0.2792879 0.2570271 0.3281074 0.4014769
## NA's
## Linear_Regression 0
## Decision_Tree 0
## Random_Forest 0
## GLMNET 0
# Box-and-whisker plot of the resampled CV metrics for visual model comparison.
bwplot(results)
As the Random Forest model is generally the best performer in the hourly model comparisons, we continue with the Random Forest model to make predictions on the test set. The predictions are made below.
# Generate test-set predictions for every hourly random forest model (hours
# 5-19). Each caret object's `finalModel` is the fitted ranger forest;
# predict() on it returns a ranger prediction object whose $predictions
# field is consumed in the next step. The loop creates the same
# `rf_h<h>_pred_test` bindings the downstream code expects.
for (h in 5:19) {
  rf_model <- get(paste0("rand_forest_h", h))
  test_subset <- get(paste0("df4_test_h", h))
  assign(paste0("rf_h", h, "_pred_test"),
         predict(rf_model$finalModel, data = test_subset))
}
Test set predictions are bound to the test data set for each hour.
# Attach each hour's predictions to its test set: keep the identifying
# columns (Date, Hour) and the actual Production, then add the random forest
# Prediction column (ranger prediction objects store the fitted values in
# $predictions). Produces df4_test_h<hour>_w_pred for each hour 5-19,
# replacing 15 copy-pasted pipelines with a single loop.
for (h in 5:19) {
  assign(
    paste0("df4_test_h", h, "_w_pred"),
    get(paste0("df4_test_h", h)) %>%
      select(Date, Hour, Production) %>%
      mutate(Prediction = get(paste0("rf_h", h, "_pred_test"))$predictions)
  )
}
The hourly test sets with predictions are row-bound, creating a single test set with predictions for the whole data.
# Row-bind the 15 hourly test sets with predictions (hours 5-19) into one
# test set covering the whole data, ordered chronologically. mget() collects
# the hourly data frames by name so the hour range has a single point of
# change; bind_rows() is the dplyr equivalent of the original rbind() call
# and fails loudly on mismatched column types.
df4_test_w_pred <- mget(paste0("df4_test_h", 5:19, "_w_pred")) %>%
  bind_rows() %>%
  arrange(Date, Hour)
head(df4_test_w_pred)
## # A tibble: 6 x 4
## Date Hour Production Prediction
## <date> <dbl> <dbl> <dbl>
## 1 2020-12-01 5 0 0
## 2 2020-12-01 6 0 0
## 3 2020-12-01 7 0 0.0478
## 4 2020-12-01 8 1.51 2.91
## 5 2020-12-01 9 5.29 5.67
## 6 2020-12-01 10 19.1 10.5
# Summary statistics (mean, sd, quantiles, histogram) of the actual
# Production vs the random forest Prediction on the combined test set.
skim(df4_test_w_pred %>% select(Production, Prediction))
| Name | df4_test_w_pred %>% selec… |
| Number of rows | 930 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Production | 0 | 1 | 7.61 | 9.21 | 0 | 0.00 | 2.60 | 14.42 | 30.00 | ▇▂▂▂▁ |
| Prediction | 0 | 1 | 6.20 | 6.75 | 0 | 0.01 | 2.97 | 11.23 | 23.08 | ▇▂▂▂▁ |
Prediction and residual analysis for the test set of the random forest model is shown. The Actual vs Predicted plot shows that the predicted values are generally close to the actual values, with only a few strongly deviating observations. The histogram of residuals shows an approximately normal distribution centered around 0, which is good for the model. Finally, the Predicted vs Residuals plot shows that the residuals are scattered around 0, which is also good for the model; the few deviating values are neither numerous nor important.
# Actual vs predicted scatter plot; the dashed red y = x line marks where
# perfect predictions would fall.
plot(df4_test_w_pred$Prediction, df4_test_w_pred$Production, xlab = "Predicted", ylab = "Actual", main = "Actual vs Predicted Plot for Random Forest Model with Test Set")
abline(a=0,b=1,col='red', lty = 2)
# Residuals = actual - predicted; the histogram checks that they are
# centered around zero.
rf_residuals_test <- df4_test_w_pred$Production - df4_test_w_pred$Prediction
hist(rf_residuals_test, xlab = "Residuals", main = "Residuals Histogram of Random Forest Model")
# Residuals vs predicted values, with a dashed red reference line at zero to
# reveal any systematic bias across the prediction range.
plot(df4_test_w_pred$Prediction, rf_residuals_test, xlab = "Predicted", ylab = "Residuals", main = "Predicted vs Residuals Plot for Random Forest Model with Test Set")
abline(h = 0, col = "red", lty = 2)
Lastly, the RMSE of the test-set predictions is shown. It is not a high value, which is also good for the model.
# Overall test-set RMSE (caret::RMSE) of the hourly random forest predictions.
RMSE(df4_test_w_pred$Production, df4_test_w_pred$Prediction)
## [1] 4.666542